import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import seaborn as sns
import plotly.express as px
from statsmodels.graphics.mosaicplot import mosaic
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.ensemble import IsolationForest
from sklearn.experimental import enable_iterative_imputer
from sklearn.compose import ColumnTransformer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, TargetEncoder
import optuna
import utils
# Notebook setup: inline plots and wider pandas display limits.
%matplotlib inline
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
# Load the extended Australian weather dataset (daily observations per location).
df = pd.read_csv("weatherAUS_extended.csv")
# Peek at the first rows to verify the load.
df.head(2)
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | WNW | 20.0 | 24.0 | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | 0 | 0 |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | WSW | 4.0 | 22.0 | 44.0 | 25.0 | 1010.6 | 1007.8 | 8.0 | NaN | 17.2 | 24.3 | 0 | 0 |
# Dataset dimensions (rows, columns).
df.shape
(149408, 23)
# Confirm the target column has no missing values (expect 0 rows).
df[df['RainTomorrow'].isna()].shape
(0, 23)
# Work on a copy so plotting-only relabeling doesn't touch the modeling frame.
plot_df = df.copy()
# Replace the 0/1 target with readable class names for the plot.
plot_df['RainTomorrow']= plot_df['RainTomorrow'].map({1:'rain', 0: 'no rain'})
# Class balance as percentages.
sns.histplot(plot_df['RainTomorrow'], stat='percent')
<AxesSubplot:xlabel='RainTomorrow', ylabel='Percent'>
class #1 – it will rain tomorrow (at least 1 mm).
class #0 – it won't rain tomorrow (less than 1 mm).
F1-score is chosen as a trade-off metric: there is no additional information about the costs of FP/FN cases, and the target is slightly imbalanced.
# Quick review: per-column dtype, non-null count, cardinality and sample values.
utils.calc_df_info(df)
| column | dtype | non_null_count | nunique | values | |
|---|---|---|---|---|---|
| 0 | Date | object | 149408 | 3523 | ['2008-12-01' '2008-12-02' '2008-12-03' ... '2... |
| 1 | Location | object | 149408 | 49 | ['Albury' 'BadgerysCreek' 'Cobar' 'CoffsHarbou... |
| 2 | MinTemp | float64 | 149408 | 389 | [13.4 7.4 12.9 9.2 17.5 14.6 14.3 7.7 9.7 ... |
| 3 | MaxTemp | float64 | 149407 | 505 | [22.9 25.1 25.7 28. 32.3 29.7 25. 26.7 31.9 ... |
| 4 | Rainfall | float64 | 149408 | 681 | [6.000e-01 0.000e+00 1.000e+00 2.000e-01 1.400... |
| 5 | Evaporation | float64 | 102662 | 358 | [ nan 1.20e+01 1.48e+01 1.26e+01 1.08e+01 ... |
| 6 | Sunshine | float64 | 94835 | 145 | [ nan 12.3 13. 13.3 10.6 12.2 8.4 0. 12.6 ... |
| 7 | WindGustDir | object | 142032 | 16 | ['W' 'WNW' 'WSW' 'NE' 'NNW' 'N' 'NNE' 'SW' 'EN... |
| 8 | WindGustSpeed | float64 | 142032 | 67 | [ 44. 46. 24. 41. 56. 50. 35. 80. 28. ... |
| 9 | WindDir9am | object | 149405 | 16 | ['W' 'NNW' 'SE' 'ENE' 'SW' 'SSE' 'S' 'NE' 'SSW... |
| 10 | WindDir3pm | object | 149406 | 16 | ['WNW' 'WSW' 'E' 'NW' 'W' 'SSE' 'ESE' 'ENE' 'N... |
| 11 | WindSpeed9am | float64 | 149407 | 43 | [ 20. 4. 19. 11. 7. 6. 15. 17. 28. ... |
| 12 | WindSpeed3pm | float64 | 149406 | 44 | [24. 22. 26. 9. 20. 17. 28. 11. 6. 13. 30. 1... |
| 13 | Humidity9am | float64 | 149388 | 101 | [ 71. 44. 38. 45. 82. 55. 49. 48. 42. ... |
| 14 | Humidity3pm | float64 | 149388 | 101 | [ 22. 25. 30. 16. 33. 23. 19. 9. 27. ... |
| 15 | Pressure9am | float64 | 136931 | 546 | [1007.7 1010.6 1007.6 1017.6 1010.8 1009.2 100... |
| 16 | Pressure3pm | float64 | 136930 | 549 | [1007.1 1007.8 1008.7 1012.8 1006. ... |
| 17 | Cloud9am | float64 | 109166 | 10 | [ 8. 7. 1. 0. 5. 4. 2. 6. 3. nan 9.] |
| 18 | Cloud3pm | float64 | 110573 | 10 | [nan 2. 8. 7. 1. 5. 4. 6. 3. 0. 9.] |
| 19 | Temp9am | float64 | 149407 | 441 | [16.9 17.2 21. 18.1 17.8 20.6 16.3 18.3 20.1 ... |
| 20 | Temp3pm | float64 | 149406 | 502 | [21.8 24.3 23.2 26.5 29.7 28.9 24.6 25.5 30.2 ... |
| 21 | RainToday | int64 | 149408 | 2 | [0 1] |
| 22 | RainTomorrow | int64 | 149408 | 2 | [0 1] |
# Visualize how many values are missing in each column (only columns with gaps).
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
if not null_counts.empty:
    null_counts.sort_values(inplace=True)
    null_counts.plot.bar()
# Drop columns where a significant share (more than ~30%) of values is missing.
df = df.drop(columns=['Sunshine','Evaporation','Cloud9am','Cloud3pm'])
# 16-point compass rose -> bearing in degrees, clockwise from North.
# Hoisted to module level so the mapping is built once instead of on every
# call: this function is `.apply`-ed row-wise over ~150k rows per column.
_COMPASS_POINTS = {
    'N': 0, 'NNE': 22.5, 'NE': 45, 'ENE': 67.5,
    'E': 90, 'ESE': 112.5, 'SE': 135, 'SSE': 157.5,
    'S': 180, 'SSW': 202.5, 'SW': 225, 'WSW': 247.5,
    'W': 270, 'WNW': 292.5, 'NW': 315, 'NNW': 337.5,
}


def compass_to_degrees(compass):
    """Convert a compass-rose label (e.g. 'NNE') to degrees from North.

    Parameters
    ----------
    compass : str or NaN
        One of the 16 compass-rose labels; NaN/unknown values are allowed.

    Returns
    -------
    int, float or None
        Bearing in degrees (0..337.5), or None for unknown labels and
        missing values — pandas turns None into NaN after `.apply`.
    """
    return _COMPASS_POINTS.get(compass, None)
# Change formats: parse dates and convert the three wind-direction columns
# from compass labels to numeric degrees (unknown/NaN -> None -> NaN).
df["Date"] = pd.to_datetime(df["Date"])
df["WindGustDir"] = df["WindGustDir"].apply(compass_to_degrees)
df["WindDir9am"] = df["WindDir9am"].apply(compass_to_degrees)
df["WindDir3pm"] = df["WindDir3pm"].apply(compass_to_degrees)
# Derive calendar features (quarter, month, week, day, day-of-week) from Date.
df = utils.add_time_features(df, date="Date")
C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:297: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead. df['week_feature'] = df[date].dt.week
# Inspect the frame after type conversion and time-feature generation.
df.head(2)
| Date | Location | MinTemp | MaxTemp | Rainfall | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | quarter_feature | month_feature | week_feature | day_feature | dayofweek_feature | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | 270.0 | 44.0 | 270.0 | 292.5 | 20.0 | 24.0 | 71.0 | 22.0 | 1007.7 | 1007.1 | 16.9 | 21.8 | 0 | 0 | 4 | 12 | 49 | 1 | 0 |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | 292.5 | 44.0 | 337.5 | 247.5 | 4.0 | 22.0 | 44.0 | 25.0 | 1010.6 | 1007.8 | 17.2 | 24.3 | 0 | 0 | 4 | 12 | 49 | 2 | 1 |
# Aggregate to month starts to inspect seasonal rainfall patterns over time.
plot_df["Date"] = pd.to_datetime(plot_df["Date"])
plot_df["Month_Date"] = plot_df["Date"].apply(utils.get_month_start_date)
utils.plot_TS_groups(plot_df, 'Month_Date', y_list=['Rainfall'])
There were quite strong seasonal patterns of Rainfall in previous years (2009-2012), with a high season in December-January. But since 2013 most peaks have shifted into Apr-Jun. So one idea for the future is to use not all of the observation history — for example, to cut off the years before 2014.
1) The last 12 months will be the test set, to test the model's performance across different seasons.
2) The previous 3 years will be the train set, which reflects the latest trends.
# Hold out the final 12 months as the test set (covers all seasons).
test_df = df[df["Date"] > "2016-06-24"]
test_df.shape
(17836, 24)
# Train set: the 3 years preceding the test window (reflects recent trends),
# excluding any rows already assigned to the test set.
df = df.loc[(~df.index.isin(test_df.index)) & (df["Date"]>="2013-06-24")]
# Keep chronological order for the TimeSeriesSplit CV used later.
df = df.sort_values(by=["Date"])
df.shape
(53753, 24)
# Binary target: 1 = rain tomorrow (>= 1 mm), 0 = no rain.
target = 'RainTomorrow'
# Feature lists are derived with the target column excluded.
numerical_features = utils.get_numerical_features(df.drop(columns=[target]))
categorical_features = utils.get_categorical_features(df.drop(columns=[target]))
# Full EDA report: enable all pairwise correlation measures explicitly and
# skip the (expensive) interaction plots; export to a standalone HTML file.
profile = ProfileReport(df,
title="Pandas Profiling Report",
correlations={
"auto": {"calculate": False},
"pearson": {"calculate": True},
"spearman": {"calculate": True},
"kendall": {"calculate": True},
"phi_k": {"calculate": True},
"cramers": {"calculate": True}},
interactions = None
)
profile.to_file("data_report.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
C:\Users\Aleksei_Bolshukhin\AppData\Roaming\Python\Python39\site-packages\ydata_profiling\model\pandas\correlations_pandas.py:36: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. C:\Users\Aleksei_Bolshukhin\AppData\Roaming\Python\Python39\site-packages\ydata_profiling\model\pandas\correlations_pandas.py:29: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. C:\Users\Aleksei_Bolshukhin\AppData\Roaming\Python\Python39\site-packages\ydata_profiling\model\pandas\correlations_pandas.py:43: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Render the profiling report inline in the notebook.
profile
# Per-feature distributions split by target class (with readable class labels).
plot_df = df.copy()
plot_df['RainTomorrow']= plot_df['RainTomorrow'].map({1:'rain', 0: 'no rain'})
for col in numerical_features:
utils.plot_hist_for_classes(col, plot_df, target)
for col in categorical_features:
utils.plot_bar_for_classes(col, plot_df, target)
1) High attention to Humidity and RainToday - the features most correlated with the target.
Also, an idea for the future - exclude some redundant variables which are correlated with each other.
2) There is a difference between locations. Idea for the future - separate models for the most and the least "rainy" groups.
# set metrics for current task
metrics = ['accuracy', 'precision', 'recall', 'f1']
# Drop the target from the feature frames so it can never leak into training.
# Previously X_train/X_test were the whole frames (target included) and only
# the preprocessor's explicit column selection prevented leakage downstream.
X_train, X_test = df.drop(columns=[target]), test_df.drop(columns=[target])
y_train, y_test = df[target], test_df[target]
1) KNNImputer for numerical features
2) SimpleImputer for categorical features
3) RobustScaler to mitigate the influence of potential outliers
def prepare_preprocessor(categ_features, num_features):
    """Assemble the preprocessing ColumnTransformer for this dataset.

    Parameters
    ----------
    categ_features : list of str
        Categorical columns: imputed with the most frequent value,
        then target-encoded.
    num_features : list of str
        Numeric columns: KNN-imputed, then robust-scaled to mitigate
        the influence of potential outliers.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Transformer applying the two pipelines to their column groups
        (all other columns are dropped by default).
    """
    numeric_pipeline = Pipeline([
        ("imputer", KNNImputer()),
        ("scaler", RobustScaler()),
    ])
    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoding", TargetEncoder()),
    ])
    return ColumnTransformer(transformers=[
        ("categ", categorical_pipeline, categ_features),
        ("num", numeric_pipeline, num_features),
    ])
# Fit the preprocessing on train only, then apply it to test (no leakage).
preprocessor = prepare_preprocessor(categorical_features, numerical_features)
X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)
# Baseline model with default hyperparameters and a fixed seed.
model = RandomForestClassifier(random_state=33, n_jobs=-1)
model.fit(X_train_transformed, y_train)
RandomForestClassifier(n_jobs=-1, random_state=33)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(n_jobs=-1, random_state=33)
# Baseline predictions on the hold-out year.
y_test_pred = model.predict(X_test_transformed)
# Metrics on test
utils.calc_metrics(y_test, y_test_pred, metrics)
{'accuracy': 0.8393698138596097,
'precision': 0.7361985920711375,
'recall': 0.47995169082125605,
'f1': 0.5810791051323292}
# Candidate models with their Optuna search spaces:
# tuples of (name, estimator, {hyperparameter: distribution}).
candidate_models = [
("LogisticRegression",
LogisticRegression(random_state=33, solver="saga", max_iter=10000),
{"C": optuna.distributions.FloatDistribution(1, 100, log=True),
"penalty": optuna.distributions.CategoricalDistribution(['l1', 'l2'])}),
("LGBMClassifier",
LGBMClassifier(random_state=33,n_jobs=-1),
{"max_depth": optuna.distributions.IntDistribution(3, 16),
"learning_rate": optuna.distributions.FloatDistribution(0.09, 0.19),
"feature_fraction": optuna.distributions.FloatDistribution(0.8, 1),
"bagging_fraction": optuna.distributions.FloatDistribution(0.8, 1),
"n_estimators": optuna.distributions.IntDistribution(100, 200, step=10),
"min_child_samples": optuna.distributions.IntDistribution(4, 16)}),
("RandomForest",
RandomForestClassifier(random_state=33,n_jobs=-1),
{"max_depth": optuna.distributions.IntDistribution(3, 16),
"max_features": optuna.distributions.FloatDistribution(0.8, 1),
"max_samples": optuna.distributions.FloatDistribution(0.8, 1),
"n_estimators": optuna.distributions.IntDistribution(100, 200, step=10),
"min_samples_leaf": optuna.distributions.IntDistribution(4, 16)}),
("SVC",
SVC(random_state=33),
{"C": optuna.distributions.FloatDistribution(1, 100, log=True),
"kernel": optuna.distributions.CategoricalDistribution(['linear', 'rbf'])})
]
# Time-ordered CV (2 splits) to respect the temporal structure; optimize F1,
# the metric chosen earlier for this slightly imbalanced target.
kfolds = 2
split = TimeSeriesSplit(n_splits=kfolds)
scoring = 'f1'
metrics_cv, metrics_test, best_estimator = utils.tune_and_compare_models(
candidate_models,
X_train_transformed, y_train, X_test_transformed, y_test,
split, scoring, metrics
)
C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:189: ExperimentalWarning: OptunaSearchCV is experimental (supported from v0.17.0). The interface can change in the future. [I 2024-05-05 17:31:39,939] A new study created in memory with name: no-name-491f4b16-9242-4095-8a42-8033bb76ddb7 [I 2024-05-05 17:32:08,318] Trial 0 finished with value: 0.5393123700008216 and parameters: {'C': 35.24751513627425, 'penalty': 'l2'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:32:48,069] Trial 1 finished with value: 0.5393123700008216 and parameters: {'C': 30.55148661909737, 'penalty': 'l1'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:33:31,526] Trial 2 finished with value: 0.5393123700008216 and parameters: {'C': 51.279828010644344, 'penalty': 'l1'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:34:09,515] Trial 3 finished with value: 0.5393123700008216 and parameters: {'C': 38.17922077052242, 'penalty': 'l1'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:34:46,895] Trial 4 finished with value: 0.5393123700008216 and parameters: {'C': 59.61506894998203, 'penalty': 'l1'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:35:22,844] Trial 5 finished with value: 0.5391663408068679 and parameters: {'C': 2.5927513001734446, 'penalty': 'l1'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:35:57,954] Trial 6 finished with value: 0.5393123700008216 and parameters: {'C': 84.3669378955689, 'penalty': 'l1'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:36:25,373] Trial 7 finished with value: 0.5393123700008216 and parameters: {'C': 17.31408667456455, 'penalty': 'l2'}. Best is trial 0 with value: 0.5393123700008216. [I 2024-05-05 17:36:59,392] Trial 8 finished with value: 0.5391663408068679 and parameters: {'C': 2.899114822950016, 'penalty': 'l1'}. Best is trial 0 with value: 0.5393123700008216. 
[I 2024-05-05 17:37:26,640] Trial 9 finished with value: 0.5393123700008216 and parameters: {'C': 31.009585861280712, 'penalty': 'l2'}. Best is trial 0 with value: 0.5393123700008216.
LogisticRegression best_estimator: LogisticRegression(C=35.24751513627425, max_iter=10000, random_state=33,
solver='saga')
LogisticRegression cv_optimized_score: 0.5393123700008216
C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:199: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:207: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:189: ExperimentalWarning: OptunaSearchCV is experimental (supported from v0.17.0). The interface can change in the future. [I 2024-05-05 17:38:17,235] A new study created in memory with name: no-name-3c7b162a-7231-4258-aeb5-8d88f17c5332 [I 2024-05-05 17:38:17,654] Trial 0 finished with value: 0.5693230670397846 and parameters: {'max_depth': 5, 'learning_rate': 0.16837559480391695, 'feature_fraction': 0.8282716499885758, 'bagging_fraction': 0.9165233736909923, 'n_estimators': 110, 'min_child_samples': 8}. Best is trial 0 with value: 0.5693230670397846. [I 2024-05-05 17:38:18,239] Trial 1 finished with value: 0.5698629194220828 and parameters: {'max_depth': 9, 'learning_rate': 0.12616460170063337, 'feature_fraction': 0.8028511725293248, 'bagging_fraction': 0.8406201323716711, 'n_estimators': 100, 'min_child_samples': 5}. Best is trial 1 with value: 0.5698629194220828. [I 2024-05-05 17:38:18,835] Trial 2 finished with value: 0.5735497575738115 and parameters: {'max_depth': 14, 'learning_rate': 0.18116098087144944, 'feature_fraction': 0.8893084720102601, 'bagging_fraction': 0.970945399499953, 'n_estimators': 110, 'min_child_samples': 11}. Best is trial 2 with value: 0.5735497575738115. [I 2024-05-05 17:38:19,494] Trial 3 finished with value: 0.56907142750022 and parameters: {'max_depth': 5, 'learning_rate': 0.13035184136682032, 'feature_fraction': 0.970891274273127, 'bagging_fraction': 0.9823190014181395, 'n_estimators': 160, 'min_child_samples': 10}. Best is trial 2 with value: 0.5735497575738115. 
[I 2024-05-05 17:38:20,453] Trial 4 finished with value: 0.5750384705488667 and parameters: {'max_depth': 14, 'learning_rate': 0.12339248863293056, 'feature_fraction': 0.9777990040240463, 'bagging_fraction': 0.9961972749738366, 'n_estimators': 170, 'min_child_samples': 16}. Best is trial 4 with value: 0.5750384705488667. [I 2024-05-05 17:38:21,080] Trial 5 finished with value: 0.5680772788487014 and parameters: {'max_depth': 4, 'learning_rate': 0.1330330891695163, 'feature_fraction': 0.9625560967683834, 'bagging_fraction': 0.9162261877376088, 'n_estimators': 150, 'min_child_samples': 8}. Best is trial 4 with value: 0.5750384705488667. [I 2024-05-05 17:38:22,063] Trial 6 finished with value: 0.5696498611292844 and parameters: {'max_depth': 10, 'learning_rate': 0.14914485485735862, 'feature_fraction': 0.9986396508952229, 'bagging_fraction': 0.9449285017644187, 'n_estimators': 200, 'min_child_samples': 7}. Best is trial 4 with value: 0.5750384705488667. [I 2024-05-05 17:38:22,814] Trial 7 finished with value: 0.5722701509225001 and parameters: {'max_depth': 5, 'learning_rate': 0.13516961469731625, 'feature_fraction': 0.9929620069079523, 'bagging_fraction': 0.9532587957015701, 'n_estimators': 190, 'min_child_samples': 11}. Best is trial 4 with value: 0.5750384705488667. [I 2024-05-05 17:38:23,631] Trial 8 finished with value: 0.5701282443070068 and parameters: {'max_depth': 6, 'learning_rate': 0.13224263505006734, 'feature_fraction': 0.9548836440972831, 'bagging_fraction': 0.9140193457087088, 'n_estimators': 200, 'min_child_samples': 12}. Best is trial 4 with value: 0.5750384705488667. [I 2024-05-05 17:38:24,244] Trial 9 finished with value: 0.5670582461573623 and parameters: {'max_depth': 10, 'learning_rate': 0.10356222316406125, 'feature_fraction': 0.8979503756014569, 'bagging_fraction': 0.9075399019334413, 'n_estimators': 100, 'min_child_samples': 4}. Best is trial 4 with value: 0.5750384705488667.
LGBMClassifier best_estimator: LGBMClassifier(bagging_fraction=0.9961972749738366,
feature_fraction=0.9777990040240463,
learning_rate=0.12339248863293056, max_depth=14,
min_child_samples=16, n_estimators=170, random_state=33)
LGBMClassifier cv_optimized_score: 0.5750384705488667
C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:199: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:207: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:189: ExperimentalWarning: OptunaSearchCV is experimental (supported from v0.17.0). The interface can change in the future. [I 2024-05-05 17:38:25,527] A new study created in memory with name: no-name-331e97bf-b317-4c76-bec2-cd3522a0c2db [I 2024-05-05 17:38:29,864] Trial 0 finished with value: 0.5306501177224249 and parameters: {'max_depth': 8, 'max_features': 0.9702733153520945, 'max_samples': 0.9074425779535134, 'n_estimators': 100, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.5306501177224249. [I 2024-05-05 17:38:37,582] Trial 1 finished with value: 0.5337859202195159 and parameters: {'max_depth': 8, 'max_features': 0.9338474739668312, 'max_samples': 0.8215447377447068, 'n_estimators': 200, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.5337859202195159. [I 2024-05-05 17:38:50,174] Trial 2 finished with value: 0.5413948306595366 and parameters: {'max_depth': 15, 'max_features': 0.8903464722911035, 'max_samples': 0.9278083309755382, 'n_estimators': 200, 'min_samples_leaf': 11}. Best is trial 2 with value: 0.5413948306595366. [I 2024-05-05 17:38:54,237] Trial 3 finished with value: 0.49590502413711957 and parameters: {'max_depth': 4, 'max_features': 0.9301310566544819, 'max_samples': 0.9117587104662254, 'n_estimators': 200, 'min_samples_leaf': 13}. Best is trial 2 with value: 0.5413948306595366. [I 2024-05-05 17:39:03,268] Trial 4 finished with value: 0.539217663652851 and parameters: {'max_depth': 12, 'max_features': 0.8804876893436934, 'max_samples': 0.9778139136456508, 'n_estimators': 160, 'min_samples_leaf': 13}. 
Best is trial 2 with value: 0.5413948306595366. [I 2024-05-05 17:39:07,623] Trial 5 finished with value: 0.517807069057673 and parameters: {'max_depth': 6, 'max_features': 0.9660938319353911, 'max_samples': 0.9982479402176359, 'n_estimators': 140, 'min_samples_leaf': 13}. Best is trial 2 with value: 0.5413948306595366. [I 2024-05-05 17:39:14,661] Trial 6 finished with value: 0.5423700484706668 and parameters: {'max_depth': 14, 'max_features': 0.8550014411146729, 'max_samples': 0.8170177719291295, 'n_estimators': 120, 'min_samples_leaf': 10}. Best is trial 6 with value: 0.5423700484706668. [I 2024-05-05 17:39:25,561] Trial 7 finished with value: 0.5406308981071967 and parameters: {'max_depth': 14, 'max_features': 0.9360356069488821, 'max_samples': 0.9468759166231331, 'n_estimators': 170, 'min_samples_leaf': 13}. Best is trial 6 with value: 0.5423700484706668. [I 2024-05-05 17:39:37,412] Trial 8 finished with value: 0.5476691038115569 and parameters: {'max_depth': 15, 'max_features': 0.8655585875365044, 'max_samples': 0.94648604730726, 'n_estimators': 160, 'min_samples_leaf': 5}. Best is trial 8 with value: 0.5476691038115569. [I 2024-05-05 17:39:39,991] Trial 9 finished with value: 0.4898242952136411 and parameters: {'max_depth': 4, 'max_features': 0.8465340536312905, 'max_samples': 0.8586699915640894, 'n_estimators': 150, 'min_samples_leaf': 14}. Best is trial 8 with value: 0.5476691038115569.
RandomForest best_estimator: RandomForestClassifier(max_depth=15, max_features=0.8655585875365044,
max_samples=0.94648604730726, min_samples_leaf=5,
n_estimators=160, n_jobs=-1, random_state=33)
RandomForest cv_optimized_score: 0.5476691038115569
C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:199: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:207: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:189: ExperimentalWarning: OptunaSearchCV is experimental (supported from v0.17.0). The interface can change in the future. [I 2024-05-05 17:40:03,880] A new study created in memory with name: no-name-3443f15c-de97-4599-b672-07cb7c557f40 [I 2024-05-05 17:42:07,525] Trial 0 finished with value: 0.5207967050985796 and parameters: {'C': 4.904897007757462, 'kernel': 'linear'}. Best is trial 0 with value: 0.5207967050985796. [I 2024-05-05 17:43:10,550] Trial 1 finished with value: 0.5197540891233099 and parameters: {'C': 39.30157596759402, 'kernel': 'rbf'}. Best is trial 0 with value: 0.5207967050985796. [I 2024-05-05 17:44:09,478] Trial 2 finished with value: 0.5072786975149022 and parameters: {'C': 9.778992072823884, 'kernel': 'rbf'}. Best is trial 0 with value: 0.5207967050985796. [I 2024-05-05 17:45:55,066] Trial 3 finished with value: 0.5208368727820742 and parameters: {'C': 3.825695092140622, 'kernel': 'linear'}. Best is trial 3 with value: 0.5208368727820742. [I 2024-05-05 17:46:54,491] Trial 4 finished with value: 0.487172288011843 and parameters: {'C': 2.323618875052263, 'kernel': 'rbf'}. Best is trial 3 with value: 0.5208368727820742. [I 2024-05-05 17:47:58,430] Trial 5 finished with value: 0.521244197628081 and parameters: {'C': 48.05476019465585, 'kernel': 'rbf'}. Best is trial 5 with value: 0.521244197628081. [I 2024-05-05 17:50:29,594] Trial 6 finished with value: 0.5204929780911233 and parameters: {'C': 6.304900116292388, 'kernel': 'linear'}. Best is trial 5 with value: 0.521244197628081. 
[I 2024-05-05 18:10:25,654] Trial 7 finished with value: 0.5214531823759057 and parameters: {'C': 73.73629161174765, 'kernel': 'linear'}. Best is trial 7 with value: 0.5214531823759057. [I 2024-05-05 18:11:40,724] Trial 8 finished with value: 0.5208359635116904 and parameters: {'C': 1.8910416192325255, 'kernel': 'linear'}. Best is trial 7 with value: 0.5214531823759057. [I 2024-05-05 18:13:38,595] Trial 9 finished with value: 0.5208368727820742 and parameters: {'C': 3.8269487904390838, 'kernel': 'linear'}. Best is trial 7 with value: 0.5214531823759057.
SVC best_estimator: SVC(C=73.73629161174765, kernel='linear', random_state=33) SVC cv_optimized_score: 0.5214531823759057
C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:199: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. C:\Users\Aleksei_Bolshukhin\02_PASTM\AB\utils.py:207: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
# Cross-validated metrics per candidate model (TimeSeriesSplit).
metrics_cv
| model | accuracy | precision | recall | f1 | |
|---|---|---|---|---|---|
| 0 | LogisticRegression | 0.845956 | 0.721423 | 0.430736 | 0.539312 |
| 1 | LGBMClassifier | 0.851035 | 0.714164 | 0.481288 | 0.575038 |
| 2 | RandomForest | 0.847240 | 0.720538 | 0.441715 | 0.547669 |
| 3 | SVC | 0.844087 | 0.729845 | 0.405811 | 0.521453 |
# LGBM had the best CV F1; refit its tuned estimator on the full train set.
best_model = best_estimator['LGBMClassifier']
best_model.fit(X_train_transformed, y_train)
LGBMClassifier(bagging_fraction=0.9961972749738366,
feature_fraction=0.9777990040240463,
learning_rate=0.12339248863293056, max_depth=14,
min_child_samples=16, n_estimators=170, random_state=33)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. LGBMClassifier(bagging_fraction=0.9961972749738366,
feature_fraction=0.9777990040240463,
learning_rate=0.12339248863293056, max_depth=14,
min_child_samples=16, n_estimators=170, random_state=33)
y_test_pred = best_model.predict(X_test_transformed)
# Metrics on test (tuned LGBM on the hold-out year).
utils.calc_metrics(test_df[target], y_test_pred, metrics)
{'accuracy': 0.8417806683112805,
'precision': 0.7152188112344873,
'recall': 0.5289855072463768,
'f1': 0.6081643987781172}
# Metrics on test - Naive model: predict tomorrow's rain equals today's rain.
utils.calc_metrics(y_test, X_test["RainToday"].fillna(0), metrics)
{'accuracy': 0.749607535321821,
'precision': 0.4606660231660232,
'recall': 0.46111111111111114,
'f1': 0.46088845968131337}
# ROC AUC Curve (uses predicted probabilities of the positive class)
utils.plot_roc_auc_curve(test_df[target].values, best_model.predict_proba(X_test_transformed)[:, 1])
# Confusion matrix (rows = predicted, columns = actual)
utils.calc_confusion_matrix(y_test, y_test_pred)
| Actual | 0 | 1 | All |
|---|---|---|---|
| Predicted | |||
| 0 | 12824 | 1950 | 14774 |
| 1 | 872 | 2190 | 3062 |
| All | 13696 | 4140 | 17836 |
# Permutation importances: n_repeats=100 for stable estimates,
# scored with F1 to match the model-selection metric.
result = permutation_importance(best_model, X_test_transformed, test_df[target], n_repeats=100, random_state=33, scoring='f1')
# Feature order must match the ColumnTransformer output: categorical first, then numeric.
utils.plot_permutation_importances(result, features=categorical_features+numerical_features)
1) LGBMClassifier shows the best results on the test set.
2) There is a significant improvement over the Naive model: LGBMClassifier's precision is about 50% better, but quite a few cases of rain are still missed by the model.
3) Humidity, Pressure and Wind Speed are the most important features.
1) Data processing: incorporate knowledge about the temporal-geospatial data distribution.
2) Feature engineering: lagged and rolling-window features, geospatial features, the weather predictions of other models.
3) Modeling approaches: DL models (Deep GPVAR, STCN), separate models for the most and the least "rainy" groups of locations.